/*******************************************************************************
 Copyright (c) 2021-2023 Arm  Corporation All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:

   * Redistributions of source code must retain the above copyright notice,
     this list of conditions and the following disclaimer.
   * Redistributions in binary form must reproduce the above copyright
     notice, this list of conditions and the following disclaimer in the
     documentation and/or other materials provided with the distribution.
   * Neither the name of Intel Corporation nor the names of its contributors
     may be used to endorse or promote products derived from this software
     without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

.arch armv8-a+crypto

#include "zuc_sbox.inc"

.section .data

/*
 * EK_d: the sixteen halfword constants combined with the key and IV bytes
 * when the LFSR is loaded (read with ldrh through pD in key_expand).
 */
.p2align	3
.type	EK_d, %object
EK_d:
.short	0x44D7, 0x26BC, 0x626B, 0x135E, 0x5789, 0x35E2, 0x7135, 0x09AF
.short	0x4D78, 0x2F13, 0x6BC4, 0x1AF1, 0x5E26, 0x3C4D, 0x789A, 0x47AC
.size	EK_d,.-EK_d

/*
 * Byte-select masks used by NONLIN_FUNC to merge the S0 and S1 S-box results.
 * Each mask is a full 16 bytes and 16-byte aligned so that a 128-bit vector
 * load stays inside the object.
 * Note: on AArch64 the argument of .align/.p2align is a power of two, so
 * 16-byte alignment is ".p2align 4" (".align 16" would request 64 KiB).
 */
.p2align	4
.type	mask_S0, %object
mask_S0:
.quad	0xff00ff00ff00ff00, 0xff00ff00ff00ff00
.size	mask_S0,.-mask_S0

.p2align	4
.type	mask_S1, %object
mask_S1:
.quad	0x00ff00ff00ff00ff, 0x00ff00ff00ff00ff
.size	mask_S1,.-mask_S1

/*
 * declare_register name, reg
 *
 * Binds the alias \name to register \reg with .req.
 * If the alias was already created by an earlier expansion (tracked through
 * the marker symbol def_\name) it is first released with .unreq, so the same
 * name can be declared again, e.g. every time a macro that declares its own
 * aliases is expanded.
 */
.macro declare_register name:req, reg:req
.ifdef def_\name
	.unreq	\name
.endif
	.set def_\name, 0
	\name	.req	\reg
.endm

/*
 * FUNC_SCALAR_SAVE
 *
 * Function prologue: allocates a 96-byte stack frame and spills x29/x30 and
 * x19-x28 into it. Frame layout (byte offsets from the new sp):
 *     0: x29, x30   16: x19, x20   32: x21, x22
 *    48: x23, x24   64: x25, x26   80: x27, x28
 * Must be paired with FUNC_SCALAR_RESTORE.
 */
.macro FUNC_SCALAR_SAVE
	stp	x29, x30, [sp, #-96]!	// allocate the frame (pre-index)
	stp	x27, x28, [sp, #80]
	stp	x25, x26, [sp, #64]
	stp	x23, x24, [sp, #48]
	stp	x21, x22, [sp, #32]
	stp	x19, x20, [sp, #16]
.endm

/*
 * FUNC_SCALAR_RESTORE
 *
 * Function epilogue matching FUNC_SCALAR_SAVE: reloads x19-x28 from the
 * 96-byte frame, then reloads x29/x30 and releases the frame.
 * Does not return; the caller of the macro issues ret.
 */
.macro FUNC_SCALAR_RESTORE
	ldp	x27, x28, [sp, #80]
	ldp	x25, x26, [sp, #64]
	ldp	x23, x24, [sp, #48]
	ldp	x21, x22, [sp, #32]
	ldp	x19, x20, [sp, #16]
	ldp	x29, x30, [sp], #96	// free the frame (post-index)
.endm

.text

/*
 * START_FUNC(fn): exports fn as a global function symbol, aligns the entry
 * point (.align 5; on AArch64 the argument is a power of two) and emits the
 * label. Close the function with END_FUNC(fn).
 */
#define START_FUNC(fn) .globl fn; \
        .type fn, %function; \
        .align 5; \
        fn:

/* END_FUNC(fn): records the size of fn in the symbol table. */
#define END_FUNC(fn) .size fn,.-fn

/*
 * Element offset in zuc_state_s context
 * (byte offsets; the sixteen 32-bit LFSR words occupy offsets 0..63)
 */
#define	OFFSET_FR1	(16*4)
#define	OFFSET_FR2	(17*4)
#define	OFFSET_BRC_X0	(18*4)
#define	OFFSET_BRC_X1	(19*4)
#define	OFFSET_BRC_X2	(20*4)
#define	OFFSET_BRC_X3	(21*4)

/* Number of rounds unrolled by ZUC_KEYGEN_VAR */
#define	MAX_ROUNDS	16

/*
 * Fixed register aliases shared by the macros below.
 *
 *     LFSR_Sn    - scratch holding LFSR word s_n loaded from the state
 *     BRC_X0..X3 - outputs of BITS_REORG
 *     wW         - W value / LFSR feedback accumulator (low half of x18)
 *     fR1, fR2   - nonlinear function registers R1 and R2
 *     pD         - pointer to the EK_d table
 *
 * Notes:
 *   - w20 (fR1) is declared a second time as wW2 inside NONLIN_FUNC; that
 *     macro reads fR1 before it writes wW2 and reloads fR1 at its end.
 *   - NOTE(review): wW uses w18/x18, which AAPCS64 treats as the platform
 *     register on some operating systems; it is not saved by
 *     FUNC_SCALAR_SAVE. Confirm this is acceptable for all targets.
 */
declare_register LFSR_S15, w3
declare_register LFSR_S14, w4
declare_register LFSR_S13, w5
declare_register LFSR_S11, w6
declare_register LFSR_S10, w8
declare_register LFSR_S9,  w7
declare_register LFSR_S7,  w9
declare_register LFSR_S5, w10
declare_register LFSR_S4, w11
declare_register LFSR_S2, w12
declare_register LFSR_S0, w13
declare_register BRC_X0,  w14
declare_register BRC_X1,  w15
declare_register BRC_X2,  w27
declare_register BRC_X3,  w28
declare_register wW,      w18
declare_register fR1,     w20
declare_register fR2,     w21
declare_register pD,      x22

/*
 * make_u31 Rt, Ke, Ek, Iv
 *
 * Builds one initial LFSR word:
 *     Rt = (Ke << 23) ^ (Ek << 8) ^ Iv
 *
 * params
 *     \Rt - destination (32-bit register)
 *     \Ke - key byte
 *     \Ek - constant from the EK_d table
 *     \Iv - IV byte
 *
 * \Rt is written directly from the operands, so no separate zeroing step is
 * needed. \Rt must not be the same register as \Ke (it is overwritten before
 * \Ke is read).
 */
.macro	make_u31 Rt, Ke, Ek, Iv
	eor	\Rt, \Iv, \Ek, lsl #8
	eor	\Rt, \Rt, \Ke, lsl #23
.endm

/*
 * key_expand index
 *
 * Builds the two initial LFSR words s[index] and s[index + 1] and stores
 * them to the state with one stp. For each word the key byte (pKe), the
 * halfword constant (EK_d table via pD) and the IV byte (pIv) are loaded and
 * combined by make_u31.
 *
 * params
 *     \index - index of the first word of the pair
 * uses
 *     pKe, pIv, pState, pD (declared/loaded by the expanding macro)
 * clobbers
 *     w3, w4, w5 (loaded inputs), w6, w7 (the two results)
 */
.macro	key_expand index
	ldrb	w3, [pKe, #(\index + 0)]
	ldrh	w4, [pD,  #((\index + 0)*2)]
	ldrb	w5, [pIv, #(\index + 0)]
	make_u31 w6, w3, w4, w5

	ldrb	w3, [pKe, #(\index + 1)]
	ldrh	w4, [pD,  #((\index + 1)*2)]
	ldrb	w5, [pIv, #(\index + 1)]
	make_u31 w7, w3, w4, w5

	// store s[index] and s[index + 1] as one pair
	stp	w6, w7, [pState, #((\index)*4)]
.endm

/*
 * BITS_REORG N
 *
 * Bit reorganization: forms the four 32-bit words X0..X3 from eight LFSR
 * words. The LFSR is kept as a 16-word ring in the state, so word s_i is
 * read from pState[((i + N) % 16) * 4].
 *
 * params
 *     \N - round number (ring offset)
 * clobbers (aliases from the register map at the top of the file)
 *     LFSR_S15 (w3), LFSR_S14 (w4), LFSR_S11 (w6), LFSR_S9 (w7),
 *     LFSR_S7 (w9), LFSR_S5 (w10), LFSR_S2 (w12), LFSR_S0 (w13)
 * return
 *     BRC_X0 = ((s15 >> 15) << 16) | (s14 & 0xffff)
 *     BRC_X1 = (s11 << 16) | (s9 >> 15)
 *     BRC_X2 = (s7  << 16) | (s5 >> 15)
 *     BRC_X3 = (s2  << 16) | (s0 >> 15)
 *     (the 32-bit left shifts discard the upper halves of s11, s7 and s2)
 */
.macro	BITS_REORG N
	// X0 from s15 and s14
	ldr	LFSR_S15, [pState, ((15 + \N) % 16)*4]
	ldr	LFSR_S14, [pState, ((14 + \N) % 16)*4]
	lsr	LFSR_S15, LFSR_S15, #15
	and	LFSR_S14, LFSR_S14, #0xffff
	orr	BRC_X0, LFSR_S14, LFSR_S15, lsl #16

	// X1 from s11 and s9
	ldr	LFSR_S11, [pState, ((11 + \N) % 16)*4]
	ldr	LFSR_S9,  [pState, (( 9 + \N) % 16)*4]
	lsr	LFSR_S9, LFSR_S9, #15
	orr	BRC_X1, LFSR_S9, LFSR_S11, lsl #16

	// X2 from s7 and s5
	ldr	LFSR_S7,  [pState, (( 7 + \N) % 16)*4]
	ldr	LFSR_S5,  [pState, (( 5 + \N) % 16)*4]
	lsr	LFSR_S5, LFSR_S5, #15
	orr	BRC_X2, LFSR_S5, LFSR_S7, lsl #16

	// X3 from s2 and s0
	ldr	LFSR_S2,  [pState, (( 2 + \N) % 16)*4]
	ldr	LFSR_S0,  [pState, (( 0 + \N) % 16)*4]
	lsr	LFSR_S0, LFSR_S0, #15
	orr	BRC_X3, LFSR_S0, LFSR_S2, lsl #16
.endm

/*
 * NONLIN_FUNC CALC_W, ARCH
 *
 * Nonlinear function F: optionally computes W, then updates fR1/fR2.
 *
 * params
 *     \CALC_W - 1: also compute wW = (BRC_X0 ^ fR1) + fR2 (from the old
 *               fR1/fR2); any other value: wW is left untouched
 *     \ARCH   - NO_AESNI selects S1_compute_NEON_NO_AESNI, anything else
 *               selects S1_compute_NEON
 * uses
 *     BRC_X0, BRC_X1, BRC_X2, fR1, fR2, mask_S0, mask_S1
 * clobbers
 *     w19 (wW1), w20 (wW2, same register as fR1), x23-x26 (xTMP..xTMP3),
 *     v0-v4
 * return
 *     updates fR1, fR2 (and wW when \CALC_W == 1)
 *
 * S0_compute_NEON / S1_compute_NEON* are defined outside this file
 * (presumably in zuc_sbox.inc).
 */
.macro	NONLIN_FUNC CALC_W, ARCH
	declare_register wW1, w19
	declare_register wW2, w20
	declare_register wTMP,  w23
	declare_register wTMP1, w24
	declare_register wTMP2, w25
	declare_register wTMP3,	w26
	declare_register xTMP,  x23
	declare_register xTMP1, x24
	declare_register xTMP2, x25
	declare_register xTMP3,	x26

.if \CALC_W == 1
	eor	wW, BRC_X0, fR1
	add	wW, wW, fR2         // W = (BRC_X0 ^ F_R1) + F_R2
.endif
	add	wW1, BRC_X1, fR1    // W1 = F_R1 + BRC_X1
	// wW2 shares w20 with fR1: fR1 must not be read below this line
	eor	wW2, fR2, BRC_X2    // W2 = F_R2 ^ BRC_X2

	lsr	wTMP1, wW2, #16
	orr	wTMP2, wTMP1, wW1, lsl #16    // P = (W1 << 16) | (W2 >> 16)
	lsr	wTMP1, wW1, #16
	orr	wTMP3, wTMP1, wW2, lsl #16    // Q = (W2 << 16) | (W1 >> 16)

	// L1(P) = P ^ rol(P,2) ^ rol(P,10) ^ rol(P,18) ^ rol(P,24)
	mov	wTMP, wTMP2
	eor	wTMP2, wTMP2, wTMP, ror #30
	eor	wTMP2, wTMP2, wTMP, ror #22
	eor	wTMP2, wTMP2, wTMP, ror #14
	eor	wTMP2, wTMP2, wTMP, ror #8    // U = L1(P)

	// L2(Q) = Q ^ rol(Q,8) ^ rol(Q,14) ^ rol(Q,22) ^ rol(Q,30)
	mov	wTMP, wTMP3
	eor	wTMP3, wTMP3, wTMP, ror #24
	eor	wTMP3, wTMP3, wTMP, ror #18
	eor	wTMP3, wTMP3, wTMP, ror #10
	eor	wTMP3, wTMP3, wTMP, ror #2    // V = L2(Q)

	// The 32-bit writes above cleared the upper half of xTMP2
	eor	xTMP1, xTMP2, xTMP3, lsl #32  // V || U

	// Both S-boxes are applied to every byte of V || U; the masks below
	// then keep the S1 result for bytes 0/2 and the S0 result for bytes
	// 1/3 of each word.
	mov	v0.d[0], xTMP1
	mov	v1.16b, v0.16b

	S0_compute_NEON	v1, v2, v3
.ifc \ARCH, NO_AESNI
	S1_compute_NEON_NO_AESNI v0, v2, v3, v4
.else
	S1_compute_NEON	v0, v2, v3, v4
.endif

	// Only the low 64 bits of v0/v1 carry data, so a 64-bit mask load is
	// sufficient (it also zeroes the unused upper half of v2) and never
	// reads beyond the mask object.
	adrp	xTMP, mask_S1
	ldr	d2, [xTMP, #:lo12:mask_S1]
	and	v0.16b, v0.16b, v2.16b

	adrp	xTMP, mask_S0
	ldr	d2, [xTMP, #:lo12:mask_S0]
	and	v1.16b, v1.16b, v2.16b

	eor	v0.16b, v0.16b, v1.16b
	mov	fR1, v0.s[0]        // F_R1 = S(U)
	mov	fR2, v0.s[1]        // F_R2 = S(V)
.endm

/*
 * LFSR_UPDT N
 *
 * One LFSR step. Computes
 *     xW + (1 + 2^8)*s0 + 2^20*s4 + 2^21*s10 + 2^17*s13 + 2^15*s15
 * in 64 bits, folds the sum twice at bit 31 and stores the low word over the
 * slot of s0 in the ring, i.e. pState[((0 + N) % 16) * 4].
 *
 * params
 *     \N - round number (ring offset)
 * in
 *     xW - extra term added to the feedback; the expanding macros clear it
 *          or set it to W >> 1 before this macro
 * clobbers
 *     x13 (s0), x11 (s4), x8 (s10), x5 (s13), x3 (s15), x23 (xTMP)
 * return
 *     xW - the new LFSR word (also written to the state)
 */
.macro	LFSR_UPDT N
	declare_register xW, x18
	declare_register xTMP, x23
	declare_register xTMP1, x24
	declare_register xLFSR_S15, x3
	declare_register xLFSR_S13, x5
	declare_register xLFSR_S10, x8
	declare_register xLFSR_S4, x11
	declare_register xLFSR_S0, x13

	// 32-bit loads zero-extend into the 64-bit views used below
	ldr	LFSR_S15, [pState, ((15 + \N) % 16)*4]
	ldr	LFSR_S13, [pState, ((13 + \N) % 16)*4]
	ldr	LFSR_S10, [pState, ((10 + \N) % 16)*4]
	ldr	LFSR_S4, [pState, ((4 + \N) % 16)*4]
	ldr	LFSR_S0, [pState, ((0 + \N) % 16)*4]

	// 64-bit feedback sum; s0 contributes both unshifted and shifted by 8
	add	xW, xW, xLFSR_S0
	lsl	xLFSR_S0, xLFSR_S0, #8
	add	xW, xW, xLFSR_S0

	lsl	xLFSR_S4, xLFSR_S4, #20
	add	xW, xW, xLFSR_S4

	lsl	xLFSR_S10, xLFSR_S10, #21
	add	xW, xW, xLFSR_S10

	lsl	xLFSR_S13, xLFSR_S13, #17
	add	xW, xW, xLFSR_S13

	lsl	xLFSR_S15, xLFSR_S15, #15
	add	xW, xW, xLFSR_S15

	// First fold: (sum & 0x7FFFFFFF) + (sum >> 31)
	lsr	xTMP, xW, #31
	and	xW, xW, #0x7FFFFFFF
	add	xW, xW, xTMP

	// Second fold absorbs the carry left by the first one
	lsr	xTMP, xW, #31
	and	xW, xW, #0x7FFFFFFF
	add	xW, xW, xTMP

	// The new word replaces s0 in the ring
	str	wW, [pState, ((0 + \N) % 16)*4]
.endm

/*
 * ZUC_INIT ARCH
 *
 * Body of the initialization functions: loads the LFSR from key and IV,
 * runs 32 initialization rounds (W >> 1 fed back into the LFSR), runs one
 * more round with the W term cleared, and saves R1, R2 and X0..X3 to the
 * state.
 *
 * params
 *     \ARCH - forwarded to NONLIN_FUNC (AESNI / NO_AESNI S-box variant)
 * in
 *     x0 - pKe    (key bytes, indices 0..15 are read)
 *     x1 - pIv    (IV bytes, indices 0..15 are read)
 *     x2 - pState (state to fill)
 *
 * Saves/restores x19-x30 through FUNC_SCALAR_SAVE/RESTORE; the expanding
 * function supplies the final ret.
 */
.macro	ZUC_INIT ARCH
	declare_register pKe, x0
	declare_register pIv, x1
	declare_register pState, x2
	declare_register xW, x18

	// save clobbered register
	FUNC_SCALAR_SAVE

	adrp	pD, EK_d
	add	pD, pD, #:lo12:EK_d

	// Expand key: two LFSR words per invocation
	key_expand 0
	key_expand 2
	key_expand 4
	key_expand 6
	key_expand 8
	key_expand 10
	key_expand 12
	key_expand 14

	// Set R1 and R2 to zero
	mov	fR1, wzr
	mov	fR2, wzr

.set counter, 0
.rept 32
	BITS_REORG counter

	NONLIN_FUNC 1, \ARCH

	// Initialization mode feeds W >> 1 into the LFSR update
	lsr	xW, xW, #1

	LFSR_UPDT counter
.set counter, (counter+1)
.endr

	// And once more, initial round from keygen phase = 33 times.
	// W is not computed here; the LFSR is updated with a zero W term.
	// The ring offset is 0 again after 32 rounds (32 % 16 == 0).
	BITS_REORG 0
	NONLIN_FUNC 0, \ARCH
	mov	xW, xzr

	LFSR_UPDT 0

	// Save the remaining state variables next to the LFSR words
	str	fR1, [pState, #OFFSET_FR1]
	str	fR2, [pState, #OFFSET_FR2]
	str	BRC_X0, [pState, #OFFSET_BRC_X0]
	str	BRC_X1, [pState, #OFFSET_BRC_X1]
	str	BRC_X2, [pState, #OFFSET_BRC_X2]
	str	BRC_X3, [pState, #OFFSET_BRC_X3]

	// Restore clobbered register
	FUNC_SCALAR_RESTORE
.endm

/*
 * ZUC_KEYGEN ARCH, NUM_ROUNDS
 *
 * Body of the fixed-length keystream functions: generates \NUM_ROUNDS 32-bit
 * keystream words and updates the state.
 *
 * params
 *     \ARCH       - forwarded to NONLIN_FUNC (AESNI / NO_AESNI variant)
 *     \NUM_ROUNDS - words to generate: 1, 2, 4, 8 or a multiple of 16
 * in
 *     x0 - pKS    (output, \NUM_ROUNDS * 4 bytes are written)
 *     x1 - pState (state, updated in place)
 *
 * Rounds use ring offsets 1 .. \NUM_ROUNDS. Afterwards the 16 LFSR words are
 * rotated down by \NUM_ROUNDS words so the next call can start at offset 1
 * again; no rotation is needed for a multiple of 16. Any other round count
 * would leave the ring mis-rotated and is rejected at assembly time.
 *
 * Saves/restores x19-x30 through FUNC_SCALAR_SAVE/RESTORE; the expanding
 * function supplies the final ret.
 */
.macro	ZUC_KEYGEN ARCH, NUM_ROUNDS
	declare_register pKS, x0
	declare_register pState, x1

	// save clobbered register
	FUNC_SCALAR_SAVE

	ldr	fR1, [pState, #OFFSET_FR1]
	ldr	fR2, [pState, #OFFSET_FR2]
	ldr	BRC_X0, [pState, #OFFSET_BRC_X0]
	ldr	BRC_X1, [pState, #OFFSET_BRC_X1]
	ldr	BRC_X2, [pState, #OFFSET_BRC_X2]
	ldr	BRC_X3, [pState, #OFFSET_BRC_X3]

.set counter, 1
.rept \NUM_ROUNDS

	BITS_REORG counter
	NONLIN_FUNC 1, \ARCH

	// Store the keystream word Z = W ^ X3
	eor	wW, wW, BRC_X3
	str	wW, [pKS], #4    // save pkeystream

	// Working mode: the LFSR update gets no W term
	mov	xW, xzr

	LFSR_UPDT counter
.set counter, (counter+1)
.endr

.if \NUM_ROUNDS == 8
	// new[0..7] = old[8..15], new[8..15] = old[0..7]
	ld1	{v0.16b, v1.16b, v2.16b, v3.16b}, [pState]
	st1	{v2.16b, v3.16b}, [pState]
	// st1 has no immediate-offset form, so build the address first
	add	xTMP, pState, #32
	st1	{v0.16b, v1.16b}, [xTMP]
.elseif \NUM_ROUNDS == 4
	// new[0..11] = old[4..15], new[12..15] = old[0..3]
	ld1	{v0.16b, v1.16b, v2.16b, v3.16b}, [pState]
	st1	{v1.16b, v2.16b, v3.16b}, [pState]
	add	xTMP, pState, #48
	st1	{v0.16b}, [xTMP]
.elseif \NUM_ROUNDS == 2
	// Rotate by two words (8 bytes). Only x3-x10 are used as scratch so
	// that BRC_X0/BRC_X1 (w14/w15) are still intact when saved below.
	ldp	x3, x4, [pState]
	ldp	x5, x6, [pState, #16]
	ldp	x7, x8, [pState, #32]
	ldp	x9, x10, [pState, #48]
	stp	x4, x5, [pState]
	stp	x6, x7, [pState, #16]
	stp	x8, x9, [pState, #32]
	stp	x10, x3, [pState, #48]
.elseif \NUM_ROUNDS == 1
	// Rotate by one word: new[0..14] = old[1..15], new[15] = old[0]
	mov	xTMP, pState
	ldr	w10, [xTMP], #4
	ld1	{v0.16b, v1.16b, v2.16b}, [xTMP], #48
	ldr	w11, [xTMP], #4
	ldr	w12, [xTMP], #4
	ldr	w13, [xTMP]

	mov     xTMP, pState
	st1	{v0.16b, v1.16b, v2.16b}, [xTMP], #48
	str	w11, [xTMP], #4
	str	w12, [xTMP], #4
	str	w13, [xTMP], #4
	str	w10, [xTMP]
.elseif (\NUM_ROUNDS % 16) != 0
	.error "ZUC_KEYGEN: NUM_ROUNDS must be 1, 2, 4, 8 or a multiple of 16"
.endif

	// Save the remaining state variables
	str	fR1, [pState, #OFFSET_FR1]
	str	fR2, [pState, #OFFSET_FR2]
	str	BRC_X0, [pState, #OFFSET_BRC_X0]
	str	BRC_X1, [pState, #OFFSET_BRC_X1]
	str	BRC_X2, [pState, #OFFSET_BRC_X2]
	str	BRC_X3, [pState, #OFFSET_BRC_X3]

	// Restore clobbered register
	FUNC_SCALAR_RESTORE
.endm

/*
 * ZUC_KEYGEN_VAR ARCH
 *
 * Body of the variable-length keystream functions: generates nRounds 32-bit
 * keystream words, at most MAX_ROUNDS per call.
 *
 * params
 *     \ARCH - forwarded to NONLIN_FUNC (AESNI / NO_AESNI variant)
 * in
 *     x0 - pKS     (output, 4 bytes written per round)
 *     x1 - pState  (state, updated in place)
 *     x2 - nRounds (number of rounds; decremented by the macro)
 *
 * Behaviour at the limits:
 *     nRounds == 0         - nothing is generated, the state is unchanged
 *     nRounds > MAX_ROUNDS - only MAX_ROUNDS words are generated
 *
 * NOTE(review): unlike ZUC_KEYGEN, the LFSR ring is not rotated back when
 * fewer than MAX_ROUNDS rounds were run, so the saved state only lines up
 * with ring offset 1 again after exactly MAX_ROUNDS rounds. This looks
 * intended for the last keystream request on a state; confirm with callers.
 *
 * Saves/restores x19-x30 through FUNC_SCALAR_SAVE/RESTORE; the expanding
 * function supplies the final ret.
 */
.macro	ZUC_KEYGEN_VAR ARCH
	declare_register pKS, x0
	declare_register pState, x1
	declare_register nRounds, x2

	// save clobbered register
	FUNC_SCALAR_SAVE

	ldr	fR1, [pState, #OFFSET_FR1]
	ldr	fR2, [pState, #OFFSET_FR2]
	ldr	BRC_X0, [pState, #OFFSET_BRC_X0]
	ldr	BRC_X1, [pState, #OFFSET_BRC_X1]
	ldr	BRC_X2, [pState, #OFFSET_BRC_X2]
	ldr	BRC_X3, [pState, #OFFSET_BRC_X3]

	// A zero count must not fall into the unrolled rounds: the first
	// "subs" would wrap and all MAX_ROUNDS words would be written.
	cbz	nRounds, 1f

.set counter, 1
.rept MAX_ROUNDS

	BITS_REORG counter

	NONLIN_FUNC 1, \ARCH

	// Store the keystream word Z = W ^ X3
	eor	wW, wW, BRC_X3
	str	wW, [pKS], #4    // save pkeystream

	// Working mode: the LFSR update gets no W term
	mov	xW, xzr

	LFSR_UPDT counter

	// Leave the unrolled sequence once the requested count is reached
	subs	nRounds, nRounds, #1
	b.eq	1f
.set counter, (counter+1)
.endr
1:
	// Save the remaining state variables
	str	fR1, [pState, #OFFSET_FR1]
	str	fR2, [pState, #OFFSET_FR2]
	str	BRC_X0, [pState, #OFFSET_BRC_X0]
	str	BRC_X1, [pState, #OFFSET_BRC_X1]
	str	BRC_X2, [pState, #OFFSET_BRC_X2]
	str	BRC_X3, [pState, #OFFSET_BRC_X3]

	// Restore clobbered register
	FUNC_SCALAR_RESTORE
.endm

/*
 * xor_keystream
 *
 * XORs 16 bytes of the input buffer with 16 bytes of keystream and writes
 * the result to the output buffer. The keystream is byte-swapped within each
 * 32-bit word (rev32) before the XOR.
 *
 * in
 *     x0 - pIn  (16 input bytes)
 *     x1 - pOut (16 output bytes)
 *     x2 - pKS  (16 keystream bytes)
 * clobbers
 *     v0, v1
 */
.macro	xor_keystream
	declare_register pIn, x0
	declare_register pOut, x1
	declare_register pKS, x2

	ld1	{v0.16b}, [pKS]
	rev32	v0.16b, v0.16b
	ld1	{v1.16b}, [pIn]
	eor	v0.16b, v0.16b, v1.16b	// keystream ^ input
	st1	{v0.16b}, [pOut]
.endm

#ifndef ZUC_CIPHER_4
/*
 * extern void asm_ZucInitialization_aarch64(uint8_t* pKey, uint8_t* pIV, uint32_t * pState)
 *
 * Initializes the ZUC state from key and IV (ZUC_INIT, S-box variant
 * selected by the AESNI argument).
 * param[in]:
 *	x0 - pKey
 *	x1 - pIV
 * param[out]:
 *	x2 - pState
 */
START_FUNC(asm_ZucInitialization_aarch64)

	ZUC_INIT AESNI

	ret
END_FUNC(asm_ZucInitialization_aarch64)

/*
 * extern void asm_ZucInitialization_aarch64_no_aesni(uint8_t* pKey, uint8_t* pIV, uint32_t * pState)
 *
 * Same as asm_ZucInitialization_aarch64, but expands ZUC_INIT with NO_AESNI
 * so that NONLIN_FUNC uses S1_compute_NEON_NO_AESNI.
 * param[in]:
 *	x0 - pKey
 *	x1 - pIV
 * param[out]:
 *	x2 - pState
 */
START_FUNC(asm_ZucInitialization_aarch64_no_aesni)

	ZUC_INIT NO_AESNI

	ret
END_FUNC(asm_ZucInitialization_aarch64_no_aesni)

/*
 * void asm_ZucGenKeystream4B_aarch64(void *pKeystream, ZucState_t *pState);
 *
 * Generates one 32-bit keystream word (4 bytes) and updates the state.
 *	x0 - KS (key stream pointer)
 *	x1 - STATE (state pointer)
 */
START_FUNC(asm_ZucGenKeystream4B_aarch64)

	ZUC_KEYGEN AESNI, 1

	ret
END_FUNC(asm_ZucGenKeystream4B_aarch64)

/*
 * void asm_ZucGenKeystream4B_aarch64_no_aesni(void *pKeystream, ZucState_t *pState);
 *
 * Generates one 32-bit keystream word (4 bytes) and updates the state,
 * using the NO_AESNI S-box variant.
 *	x0 - KS (key stream pointer)
 *	x1 - STATE (state pointer)
 */
START_FUNC(asm_ZucGenKeystream4B_aarch64_no_aesni)

	ZUC_KEYGEN NO_AESNI, 1

	ret
END_FUNC(asm_ZucGenKeystream4B_aarch64_no_aesni)

/*
 * void asm_ZucGenKeystream8B_aarch64(void *pKeystream, ZucState_t *pState);
 *
 * Generates two 32-bit keystream words (8 bytes) and updates the state.
 *	x0 - KS (key stream pointer)
 *	x1 - STATE (state pointer)
 */
START_FUNC(asm_ZucGenKeystream8B_aarch64)

	ZUC_KEYGEN AESNI, 2

	ret
END_FUNC(asm_ZucGenKeystream8B_aarch64)

/*
 * void asm_ZucGenKeystream8B_aarch64_no_aesni(void *pKeystream, ZucState_t *pState);
 *
 * Generates two 32-bit keystream words (8 bytes) and updates the state,
 * using the NO_AESNI S-box variant.
 *	x0 - KS (key stream pointer)
 *	x1 - STATE (state pointer)
 */
START_FUNC(asm_ZucGenKeystream8B_aarch64_no_aesni)

	ZUC_KEYGEN NO_AESNI, 2

	ret
END_FUNC(asm_ZucGenKeystream8B_aarch64_no_aesni)

/*
 * void asm_ZucGenKeystream16B_aarch64(uint32_t * pKeystream, uint32_t * pState);
 *
 * Generates four 32-bit keystream words (16 bytes) and updates the state.
 *	x0 - KS (key stream pointer)
 *	x1 - STATE (state pointer)
 */
START_FUNC(asm_ZucGenKeystream16B_aarch64)

	ZUC_KEYGEN AESNI, 4

	ret
END_FUNC(asm_ZucGenKeystream16B_aarch64)

/*
 * void asm_ZucGenKeystream16B_aarch64_no_aesni(uint32_t * pKeystream, uint32_t * pState);
 *
 * Generates four 32-bit keystream words (16 bytes) and updates the state,
 * using the NO_AESNI S-box variant.
 *	x0 - KS (key stream pointer)
 *	x1 - STATE (state pointer)
 */
START_FUNC(asm_ZucGenKeystream16B_aarch64_no_aesni)

	ZUC_KEYGEN NO_AESNI, 4

	ret
END_FUNC(asm_ZucGenKeystream16B_aarch64_no_aesni)

/*
 * void asm_ZucGenKeystream_aarch64(uint32_t * pKeystream, uint32_t * pState,
 *                              uint64_t numRounds);
 *
 * Generates numRounds 32-bit keystream words; the expanded ZUC_KEYGEN_VAR
 * body unrolls MAX_ROUNDS (16) rounds and leaves early when the count
 * reaches zero.
 *	x0 - KS (key stream pointer)
 *	x1 - STATE (state pointer)
 * 	x2 - NROUNDS (number of 4B rounds)
 */
START_FUNC(asm_ZucGenKeystream_aarch64)

	ZUC_KEYGEN_VAR AESNI

	ret
END_FUNC(asm_ZucGenKeystream_aarch64)

/*
 * void asm_ZucGenKeystream_aarch64_no_aesni(uint32_t * pKeystream, uint32_t * pState,
 *                                           uint64_t numRounds);
 *
 * Generates numRounds 32-bit keystream words using the NO_AESNI S-box
 * variant; the expanded ZUC_KEYGEN_VAR body unrolls MAX_ROUNDS (16) rounds
 * and leaves early when the count reaches zero.
 *	x0 - KS (key stream pointer)
 *	x1 - STATE (state pointer)
 * 	x2 - NROUNDS (number of 4B rounds)
 */
START_FUNC(asm_ZucGenKeystream_aarch64_no_aesni)

	ZUC_KEYGEN_VAR NO_AESNI

	ret
END_FUNC(asm_ZucGenKeystream_aarch64_no_aesni)

#endif
